Each chunk represents a code window in the book. Please note that some chunks could not run properly or take a too much time to do it. The first line of these chunks is a comment, starting with the simbol # to clarify why is this.
library(mlbench)
data(PimaIndiansDiabetes2)
? PimaIndiansDiabetes2
head(PimaIndiansDiabetes2)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 NA 33.6 0.627 50 pos
## 2 1 85 66 29 NA 26.6 0.351 31 neg
## 3 8 183 64 NA NA 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 NA NA 25.6 0.201 30 neg
attach(PimaIndiansDiabetes2)
## The following object is masked from package:datasets:
##
## pressure
mean(c(4, 5, 8, 2, 8, 4, 5, 9, 2, 7))
## [1] 5.4
mean(mass, na.rm=TRUE)
## [1] 32.45746
sapply(PimaIndiansDiabetes2[, -9], mean, na.rm=TRUE)
## pregnant glucose pressure triceps insulin mass
## 3.8450521 121.6867628 72.4051842 29.1534196 155.5482234 32.4574637
## pedigree age
## 0.4718763 33.2408854
sales.per <- c(0.10, -0.05, 0.20, 0.05, -0.03)
sales <- 1 + sales.per
5000 * prod(sales)
## [1] 6385.995
5000 * (1.050151) ^ 5
## [1] 6385.998
5000 * (1.054) ^ 5
## [1] 6503.888
library(psych)
geometric.mean(sales) - 1
## [1] 0.05015091
23 / 10 + 23 / 30
## [1] 3.066667
46 / 20
## [1] 2.3
speeds = c(10, 30)
harmonic.mean(speeds, na.rm=TRUE)
## [1] 15
46 / 15
## [1] 3.066667
median(1 : 7)
## [1] 4
median(1 : 6)
## [1] 3.5
median(mass , na.rm=TRUE)
## [1] 32.3
mean(c(1 : 7, 30))
## [1] 7.25
median(c(1 : 7, 30))
## [1] 4.5
sapply(PimaIndiansDiabetes2[, -9], median, na.rm=TRUE)
## pregnant glucose pressure triceps insulin mass pedigree age
## 3.0000 117.0000 72.0000 29.0000 125.0000 32.3000 0.3725 29.0000
library(DescTools)
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:psych':
##
## AUC, ICC, SD
Mode(mass)
## [1] NA
## attr(,"freq")
## [1] NA
sapply(PimaIndiansDiabetes2, Mode, na.rm=TRUE)
## $pregnant
## [1] 1
## attr(,"freq")
## [1] 135
##
## $glucose
## [1] 99 100
## attr(,"freq")
## [1] 17
##
## $pressure
## [1] 70
## attr(,"freq")
## [1] 57
##
## $triceps
## [1] 32
## attr(,"freq")
## [1] 31
##
## $insulin
## [1] 105
## attr(,"freq")
## [1] 11
##
## $mass
## [1] 32
## attr(,"freq")
## [1] 13
##
## $pedigree
## [1] 0.254 0.258
## attr(,"freq")
## [1] 6
##
## $age
## [1] 22
## attr(,"freq")
## [1] 72
##
## $diabetes
## [1] neg
## attr(,"freq")
## [1] 500
## Levels: neg pos
# plot histogram
hist(mass , col="seashell", border="black", xlab="Body mass index", main="Pima Indians Diabetes")
# add mean
abline(v=mean(mass, na.rm=TRUE), col="red", lwd=2)
# add median
abline(v=median(mass, na.rm=TRUE), col="blue", lwd=2)
abline(v=Mode(mass), col="seagreen", lwd=2) # add mode
legend(x="topright", c("Mean", "Median", "Mode"),
col=c("red", "blue", "seagreen"), lwd=c(2, 2)) # legend
quantile(mass, probs=0.65, na.rm=TRUE)
## 65%
## 34.54
quantile(mass, na.rm=TRUE)
## 0% 25% 50% 75% 100%
## 18.2 27.5 32.3 36.6 67.1
quantile(mass, probs=seq(0, 1, 0.10), na.rm=TRUE)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 18.20 24.00 26.20 28.40 30.34 32.30 33.80 35.50 37.88 41.62 67.10
percent.fun <- ecdf(mass)
percent.fun(40)
## [1] 0.8731836
percent.fun(median(mass , na.rm=TRUE))
## [1] 0.5019815
summary(mass)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.20 27.50 32.30 32.46 36.60 67.10 11
summary(PimaIndiansDiabetes2)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## insulin mass pedigree age diabetes
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00 neg:500
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00 pos:268
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
IQR(mass, na.rm=TRUE)
## [1] 9.1
sapply(PimaIndiansDiabetes2[, -9], IQR, na.rm=TRUE)
## pregnant glucose pressure triceps insulin mass pedigree age
## 5.0000 42.0000 16.0000 14.0000 113.7500 9.1000 0.3825 17.0000
boxplot(mass, xlab="Body mass index", col="papayawhip")
boxplot(pressure, xlab="Systolic blood pressure", col="pink")
boxplot(mass)$out
## [1] 53.2 55.0 67.1 52.3 52.3 52.9 59.4 57.3
boxplot(pressure)$out
## [1] 30 110 108 122 30 110 108 110 24 38 106 106 106 114
boxplot(mass, range=3, xlab="Body mass index", col="papayawhip")
boxplot(pressure, range=3, xlab="Systolic blood pressure", col="pink")
boxplot(mass , range=3)$out
## [1] 67.1
boxplot(pressure , range=3)$out
## numeric(0)
var.p <- function(x){
var(x, na.rm=TRUE) * (length(x) - 1) / (length(x))
}
sd.p <- function(x){
sqrt(var.p(x))
}
class1 <- c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5)
class2 <- c(0, 0, 0, 0, 0, 10, 10, 10, 10, 10)
var.p(class1)
## [1] 0
var.p(class2)
## [1] 25
sd.p(class1)
## [1] 0
sd.p(class2)
## [1] 5
var.p(mass)
## [1] 47.89302
sd.p(mass)
## [1] 6.920478
sapply(PimaIndiansDiabetes2[, -9], sd.p)
## pregnant glucose pressure triceps insulin mass
## 3.3673836 30.5157546 12.3740943 10.4701592 118.6985020 6.9204784
## pedigree age
## 0.3311128 11.7525726
sd.p(mass) / mean(mass , na.rm=TRUE)
## [1] 0.2132169
sd.p(pressure) / mean(pressure , na.rm=TRUE)
## [1] 0.1709007
library(moments)
skewness(pressure, na.rm=TRUE)
## [1] 0.133878
skewness(pregnant, na.rm=TRUE)
## [1] 0.8999119
x <- c(rep(1, 100), rep(2, 140), rep(3, 25), rep(4, 15), rep(5, 10))
y <- c(rep(1, 100), rep(2, 140), rep(3, 30), rep(4, 20), rep(5, 15))
skewness(x)
## [1] 1.340999
skewness(y)
## [1] 1.214411
x <- c(rep(1, 30), rep(2, 40), rep(3, 250), rep(4, 400), rep(5, 150), rep(6, 120), rep(7, 10))
skewness(x)
## [1] 0
kurtosis(triceps, na.rm=TRUE)
## [1] 5.897361
sapply(PimaIndiansDiabetes2[,-9], FUN=kurtosis, na.rm=TRUE)
## pregnant glucose pressure triceps insulin mass pedigree age
## 3.150383 2.716919 3.896780 5.897361 9.274775 3.849771 8.550792 3.631177
salary1 <- c(10, 15, 20, 21, 22, 35, 35, 38, 60, 100)
salary2 <- c(21, 22, 23, 25, 32, 38, 42, 46, 46, 50)
Gini(salary1)
## [1] 0.3963795
Gini(salary2)
## [1] 0.1935588
Gini(pregnant, na.rm=TRUE)
## [1] 0.4808656
Gini(pressure, na.rm=TRUE)
## [1] 0.09480261
plot(Lc(pregnant), col="red", lwd=2, main="Lorentz curve for pregnant", xlab="Cumulative individuals", ylab="Cumulative values")
plot(Lc(pressure, na.rm=TRUE), col="blue", lwd=2, main="Lorentz curve for pressure", xlab="Cumulative individuals", ylab="Cumulative values")
library(ineq)
## Registered S3 methods overwritten by 'ineq':
## method from
## plot.Lc DescTools
## lines.Lc DescTools
##
## Attaching package: 'ineq'
## The following objects are masked from 'package:DescTools':
##
## Atkinson, Gini, Herfindahl, Lc, Rosenbluth
Theil(pregnant)
## [1] 0.2476443
Theil(pressure)
## [1] 0.01478803
class(mass)
## [1] "numeric"
class(pregnant)
## [1] "numeric"
sort(unique(pregnant))
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17
loaded <- data.frame(values=c(1, 2, 3, 4, 5, 6), probs=c(1 / 10, 1 / 10, 3 / 10, 2 / 10, 2 / 10, 1 / 10))
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(data=loaded , aes(x=values, y=probs)) +
geom_bar(stat="identity", fill="aquamarine3") +
ylim(c(0, 0.35))
ggplot(data=PimaIndiansDiabetes2, aes(x=mass)) +
geom_density() +
xlab("Mass") +
ylab("Density")
## Warning: Removed 11 rows containing non-finite values (stat_density).
ggplot(data=PimaIndiansDiabetes2, aes(x=mass)) +
stat_ecdf() +
xlab("Mass") +
ylab("Cumulative density")
## Warning: Removed 11 rows containing non-finite values (stat_ecdf).
pdf.density <- density(mass , na.rm=TRUE)
pdf.density$x[150]
## [1] 30.42878
pdf.density$y[150]
## [1] 0.05387463
ecdf.cumul <- ecdf(mass)
ecdf.cumul(35)
## [1] 0.677675
set.seed(5)
x <- 1 : 50
sample(x, size=10)
## [1] 2 43 15 11 41 21 30 7 19 3
set.seed(5)
sample(c("H", "T"), 25, replace=TRUE)
## [1] "T" "H" "H" "H" "H" "H" "H" "H" "T" "H" "H" "H" "H" "T" "T" "H" "H" "T" "T"
## [20] "T" "H" "T" "H" "H" "T"
set.seed(5)
sample(1 : 6, size=10, prob=c(0.1, 0.1, 0.1, 0.1, 0.1, 0.5), replace=TRUE)
## [1] 6 4 1 6 6 5 3 2 1 6
? Distributions
dbinom(2, size=5, prob=0.2)
## [1] 0.2048
pbinom(2, 5, 0.2, lower.tail=TRUE)
## [1] 0.94208
pbinom(2, 5, 0.2, lower.tail=FALSE)
## [1] 0.05792
qbinom(0.95, 5, 0.2)
## [1] 3
set.seed(5)
rbinom(10, 5, 0.2)
## [1] 0 1 2 0 0 1 1 2 3 0
values <- 0 : 5
plot(values, dbinom(values , 5, 0.2), type="b", xlab="Values", ylab="Probabilities", main="Mass")
plot(values, pbinom(values , 5, 0.2), type="b", xlab="Values", ylab="Probabilities", main="Cumulative")
values <- 0 : 11
plot(values, dpois(values, 5), type="b", xlab = "Values", ylab="Probabilities", main="Mass")
plot(ppois(values, 5), type="b", xlab = "Values", ylab="Probabilities", main="Cumulative")
pnorm(80, 70, 15) - pnorm(60, 70, 15)
## [1] 0.4950149
x <- seq(40, 100, by=0.5)
curve(dnorm(x, 70, 15), xlim=c(40, 100), col="blue", lwd=2, xlab="x", ylab="f(x)", main="Density function N(70,15)")
curve(pnorm(x, 70, 15), xlim=c(40, 100), col="blue", lwd=2, xlab="x", ylab="f(x)", main="Cumulative function N(70,15)")
set.seed(1)
sample <- rnorm(1000, 70, 15)
hist(sample, freq=FALSE, breaks=seq(20, 130, 10), col="seashell", xlab="Heart rate", main="Histogram for simulation of N(70,15)")
curve(dnorm(x, 70, 15), xlim=c(20, 130), col="blue", lwd=2, add=TRUE)
qqnorm(mass, main="Normal Q-Q plot for mass")
qqline(mass, col="blue", lwd=2) # adds a reference line
qqnorm(rnorm(1000, 70, 15), main="Normal Q-Q plot for N(70,15)")
qqline(rnorm(1000, 70, 15), col="blue", lwd=2)
m = 10000 # sample size of the final variable
lambda = 4 # poisson parameter
par(mfrow=c(2, 2))
for (n in c(1, 5, 10, 100)) {
x <- NULL #initialization of x
for (i in 1:m) {
x <- c(x, mean(rpois(n, lambda)))
}
x.normalized <- (x-lambda)/(sqrt(lambda/n))
hist(x.normalized , col="seashell",
main=paste("Mean of", n,"Po(4)"), xlab="Value")
}
n = 100
m = 10000
mu = mean(mass , na.rm=TRUE)
sigma = sd(mass , na.rm=TRUE)
x <- NULL
for (i in 1:m) {
x <- c(x, mean(sample(na.omit(mass), n)))
}
y = (x-mu)/(sigma/sqrt(n))
hist(y, col="seashell", main=paste(paste("Mean of 100",sep=" "),"mass",sep=" "), xlab="Value")
mean(triceps, na.rm=TRUE)
## [1] 29.15342
sd(triceps, na.rm=TRUE) ^ 2
## [1] 109.7672
conf.level <- 0.95
t <- qt((1 - conf.level) / 2, df=length(triceps) - 1, lower.tail = FALSE)
sigma <- sd(triceps , na.rm=TRUE)
mean <- mean(triceps , na.rm=TRUE)
sd.mean <- sigma / sqrt(length(triceps) - sum(is.na(triceps)))
interval <- c(mean - t * sd.mean, mean + t * sd.mean)
interval
## [1] 28.26918 30.03766
MeanCI(triceps, na.rm=TRUE, conf.level=0.95)
## mean lwr.ci upr.ci
## 29.15342 28.26859 30.03825
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
set.seed(1)
resampling <- boot(na.omit(triceps), function(x, i) mean(x[i]), R=10000)
boot.ci(resampling, conf = 0.95)
## Warning in boot.ci(resampling, conf = 0.95): bootstrap variances needed for
## studentized intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = resampling, conf = 0.95)
##
## Intervals :
## Level Normal Basic
## 95% (28.28, 30.04 ) (28.26, 30.02 )
##
## Level Percentile BCa
## 95% (28.29, 30.05 ) (28.32, 30.08 )
## Calculations and Intervals on Original Scale
VarCI(mass, na.rm=TRUE, conf.level=0.95)
## var lwr.ci upr.ci
## 47.95546 43.46581 53.18227
t.test(pressure, mu=70, alternative="greater")
##
## One Sample t-test
##
## data: pressure
## t = 5.259, df = 732, p-value = 9.516e-08
## alternative hypothesis: true mean is greater than 70
## 95 percent confidence interval:
## 71.65196 Inf
## sample estimates:
## mean of x
## 72.40518
t.test(pressure, mu=72, alternative="two.sided")
##
## One Sample t-test
##
## data: pressure
## t = 0.88595, df = 732, p-value = 0.3759
## alternative hypothesis: true mean is not equal to 72
## 95 percent confidence interval:
## 71.50732 73.30305
## sample estimates:
## mean of x
## 72.40518
t.test(pressure, mu=71.4, alternative="two.sided")
##
## One Sample t-test
##
## data: pressure
## t = 2.1979, df = 732, p-value = 0.02827
## alternative hypothesis: true mean is not equal to 71.4
## 95 percent confidence interval:
## 71.50732 73.30305
## sample estimates:
## mean of x
## 72.40518
test <- t.test(pressure, mu=70, alternative="greater")
test$p.value
## [1] 9.515989e-08
t.test(insulin, glucose, mu=0, alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: insulin and glucose
## t = 5.5647, df = 420.03, p-value = 4.688e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 21.90042 45.82250
## sample estimates:
## mean of x mean of y
## 155.5482 121.6868
t.test(insulin, glucose, mu=0, alternative="less")
##
## Welch Two Sample t-test
##
## data: insulin and glucose
## t = 5.5647, df = 420.03, p-value = 1
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 43.89268
## sample estimates:
## mean of x mean of y
## 155.5482 121.6868
t.test(insulin, glucose, mu=30, alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: insulin and glucose
## t = 0.63458, df = 420.03, p-value = 0.5261
## alternative hypothesis: true difference in means is not equal to 30
## 95 percent confidence interval:
## 21.90042 45.82250
## sample estimates:
## mean of x mean of y
## 155.5482 121.6868
VarTest(pressure, sigma.squared=153)
##
## One Sample Chi-Square test on variance
##
## data: pressure
## X-squared = 733.52, df = 732, p-value = 0.9544
## alternative hypothesis: true variance is not equal to 153
## 95 percent confidence interval:
## 138.7477 170.3223
## sample estimates:
## variance of x
## 153.3178
var.test(pressure, mass, ratio=3, alternative= "two.sided")
##
## F test to compare two variances
##
## data: pressure and mass
## F = 1.0657, num df = 732, denom df = 756, p-value = 0.3855
## alternative hypothesis: true ratio of variances is not equal to 3
## 95 percent confidence interval:
## 2.768995 3.691989
## sample estimates:
## ratio of variances
## 3.197088
library(snpar)
runs.test(c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0))
##
## Approximate runs rest
##
## data: c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0)
## Runs = 2, p-value = 0.01287
## alternative hypothesis: two.sided
set.seed(5)
sample.vector <- rnorm(100000, 0, 1)
binary.vector <- ifelse(sample.vector < 0, 0, 1)
runs.test(binary.vector)
##
## Approximate runs rest
##
## data: binary.vector
## Runs = 50172, p-value = 0.268
## alternative hypothesis: two.sided
ks.test(glucose, pressure)
## Warning in ks.test(glucose, pressure): p-value will be approximate in the
## presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: glucose and pressure
## D = 0.80399, p-value < 2.2e-16
## alternative hypothesis: two-sided
set.seed(5)
ks.test(rpois(100, 5), rpois(200, 5))
## Warning in ks.test(rpois(100, 5), rpois(200, 5)): p-value will be approximate in
## the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: rpois(100, 5) and rpois(200, 5)
## D = 0.07, p-value = 0.8996
## alternative hypothesis: two-sided
shapiro.test(insulin)
##
## Shapiro-Wilk normality test
##
## data: insulin
## W = 0.8041, p-value < 2.2e-16
set.seed(5)
shapiro.test(rnorm(5000, 0, 1))
##
## Shapiro-Wilk normality test
##
## data: rnorm(5000, 0, 1)
## W = 0.99961, p-value = 0.4446
ggplot(PimaIndiansDiabetes2, aes(x=mass, y=pressure)) +
geom_point()
## Warning: Removed 39 rows containing missing values (geom_point).
ggplot(PimaIndiansDiabetes2, aes(x=mass, y=triceps)) +
geom_point()
## Warning: Removed 229 rows containing missing values (geom_point).
ggplot(PimaIndiansDiabetes2, aes(x=mass, y=triceps)) +
geom_point() +
geom_vline(xintercept=mean(mass, na.rm=TRUE), colour="red", size=1.2) +
geom_hline(yintercept=mean(triceps, na.rm=TRUE), colour="blue", size=1.2) +
annotate("text", label="mean of mass", angle=90, x=34, y=75, size=4, colour="red") +
annotate("text", label="mean of triceps", x=55, y=25, size=4, colour="blue")
## Warning: Removed 229 rows containing missing values (geom_point).
cov(mass, pressure, use="pairwise.complete.obs")
## [1] 24.64499
cov(mass, triceps, use="pairwise.complete.obs")
## [1] 46.72566
cor(mass, pressure, use="pairwise.complete.obs")
## [1] 0.2892303
cor(mass, triceps, use="pairwise.complete.obs")
## [1] 0.6482139
cor(PimaIndiansDiabetes2[, -9], use="pairwise.complete.obs")
## pregnant glucose pressure triceps insulin mass
## pregnant 1.00000000 0.1281346 0.214178483 0.1002391 0.08217103 0.02171892
## glucose 0.12813455 1.0000000 0.223191778 0.2280432 0.58118621 0.23277051
## pressure 0.21417848 0.2231918 1.000000000 0.2268391 0.09827230 0.28923034
## triceps 0.10023907 0.2280432 0.226839067 1.0000000 0.18488842 0.64821394
## insulin 0.08217103 0.5811862 0.098272299 0.1848884 1.00000000 0.22805016
## mass 0.02171892 0.2327705 0.289230340 0.6482139 0.22805016 1.00000000
## pedigree -0.03352267 0.1372457 -0.002804527 0.1150164 0.13039507 0.15538175
## age 0.54434123 0.2671356 0.330107425 0.1668158 0.22026068 0.02584146
## pedigree age
## pregnant -0.033522673 0.54434123
## glucose 0.137245741 0.26713555
## pressure -0.002804527 0.33010743
## triceps 0.115016426 0.16681577
## insulin 0.130395072 0.22026068
## mass 0.155381746 0.02584146
## pedigree 1.000000000 0.03356131
## age 0.033561312 1.00000000
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(PimaIndiansDiabetes2[,-9], use="pairwise.complete.obs"))
plot(PimaIndiansDiabetes2)
model <- lm(triceps ~ mass, data=PimaIndiansDiabetes2)
model$coefficients
## (Intercept) mass
## -3.3734852 0.9894821
summary(model)
##
## Call:
## lm(formula = triceps ~ mass, data = PimaIndiansDiabetes2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.764 -5.068 -0.612 5.021 68.038
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.37349 1.68557 -2.001 0.0459 *
## mass 0.98948 0.05016 19.727 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.995 on 537 degrees of freedom
## (229 observations deleted due to missingness)
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.4191
## F-statistic: 389.2 on 1 and 537 DF, p-value: < 2.2e-16
summary(model)$r.squared
## [1] 0.4201813
ggplot(PimaIndiansDiabetes2, aes(x=mass, y=triceps)) +
geom_point() +
geom_smooth(method="lm", se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 229 rows containing non-finite values (stat_smooth).
## Warning: Removed 229 rows containing missing values (geom_point).
predict(model, data.frame(mass=30), interval="confidence")
## fit lwr upr
## 1 26.31098 25.5768 27.04515
library(psych)
pairs.panels(PimaIndiansDiabetes2[, -9],
method="pearson", hist.col="seashell",
density=TRUE , lm=TRUE)
model <- lm(triceps ~ mass + glucose + pressure)
summary(model)
##
## Call:
## lm(formula = triceps ~ mass + glucose + pressure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.128 -4.878 -0.649 4.540 66.358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.20853 2.43688 -2.548 0.0111 *
## mass 0.95545 0.05410 17.662 <2e-16 ***
## glucose 0.02310 0.01171 1.972 0.0491 *
## pressure 0.01637 0.03003 0.545 0.5860
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.008 on 528 degrees of freedom
## (236 observations deleted due to missingness)
## Multiple R-squared: 0.4242, Adjusted R-squared: 0.4209
## F-statistic: 129.7 on 3 and 528 DF, p-value: < 2.2e-16
model <- lm(mass ~ ., data=PimaIndiansDiabetes2)
summary(model)
##
## Call:
## lm(formula = mass ~ ., data = PimaIndiansDiabetes2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9216 -3.3893 -0.7394 3.0061 21.5470
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.112359 1.850813 8.165 4.69e-15 ***
## pregnant -0.228028 0.108476 -2.102 0.0362 *
## glucose -0.004484 0.011429 -0.392 0.6950
## pressure 0.103319 0.021873 4.724 3.26e-06 ***
## triceps 0.398051 0.025744 15.462 < 2e-16 ***
## insulin 0.005744 0.002644 2.172 0.0304 *
## pedigree 0.817712 0.760767 1.075 0.2831
## age -0.047715 0.036370 -1.312 0.1903
## diabetespos 1.576717 0.659531 2.391 0.0173 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.01 on 383 degrees of freedom
## (376 observations deleted due to missingness)
## Multiple R-squared: 0.5023, Adjusted R-squared: 0.4919
## F-statistic: 48.31 on 8 and 383 DF, p-value: < 2.2e-16
contrasts(diabetes)
## pos
## neg 0
## pos 1
model <- lm(mass ~ . -glucose - pedigree - age, data=PimaIndiansDiabetes2)
summary(model)
##
## Call:
## lm(formula = mass ~ . - glucose - pedigree - age, data = PimaIndiansDiabetes2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1586 -3.3481 -0.6686 2.9879 21.5916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.498506 1.526983 9.495 < 2e-16 ***
## pregnant -0.325957 0.082872 -3.933 9.93e-05 ***
## pressure 0.095618 0.021376 4.473 1.02e-05 ***
## triceps 0.400059 0.025548 15.659 < 2e-16 ***
## insulin 0.004908 0.002251 2.180 0.0298 *
## diabetespos 1.451142 0.595921 2.435 0.0153 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.01 on 386 degrees of freedom
## (376 observations deleted due to missingness)
## Multiple R-squared: 0.4983, Adjusted R-squared: 0.4918
## F-statistic: 76.69 on 5 and 386 DF, p-value: < 2.2e-16
linear.price <- lm(price ~ x, data=diamonds)
summary(linear.price)$adj.r.squared
## [1] 0.7822215
ggplot(diamonds, aes(y=price , x=x)) +
geom_point(alpha=.5) +
geom_smooth(method="lm") +
xlim(3, 11) +
ylim(0, 19000)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_smooth).
poly.price <- lm(price ~ poly(x, 2), data=diamonds)
summary(poly.price)$adj.r.squared
## [1] 0.8591606
ggplot(diamonds , aes(y=price , x=x)) +
geom_point(alpha=.5) +
geom_smooth(method="lm", formula=y ~ poly(x, 2)) +
xlim(3, 11) +
ylim(0, 19000)
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 22 rows containing missing values (geom_smooth).
linear.carat <- lm(x ~ carat , data=diamonds)
summary(linear.carat)$adj.r.squared
## [1] 0.9508078
ggplot(diamonds, aes(y=x, x=carat)) +
geom_point(alpha=.5) +
geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'
linear.carat <- lm(x ~ log(carat), data=diamonds)
summary(linear.carat)$adj.r.squared
## [1] 0.9804279
ggplot(diamonds, aes(y=x, x=carat)) +
geom_point(alpha=.5) +
geom_smooth(method="lm", formula=y ~ log(x))
contingency <- table(diabetes, age)
contingency
## age
## diabetes 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## neg 58 61 31 38 34 25 24 25 16 15 11 7 7 10 5 6 13 6 9 7 9 11 2
## pos 5 11 7 8 14 8 8 10 13 6 13 9 10 4 5 10 6 10 3 6 13 7 11
## age
## diabetes 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
## neg 3 7 6 2 4 2 3 3 1 1 2 3 1 4 4 1 3 1 2 4 1 3 2
## pos 5 8 7 4 1 3 5 5 7 4 4 1 2 1 3 2 2 1 2 0 0 0 2
## age
## diabetes 67 68 69 70 72 81
## neg 2 1 2 0 1 1
## pos 1 0 0 1 0 0
library(vcd)
## Loading required package: grid
assocstats(contingency)
## X^2 df P(> X^2)
## Likelihood Ratio 150.06 51 1.0889e-11
## Pearson 140.94 51 2.3070e-10
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.394
## Cramer's V : 0.428
boxplot(mass ~ diabetes, data=PimaIndiansDiabetes2, ylab="Mass")
aov <- aov(mass ~ diabetes)
summary(aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## diabetes 1 3567 3567 82.4 <2e-16 ***
## Residuals 755 32687 43
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 11 observations deleted due to missingness
log.model <- glm(diabetes ~ ., data=PimaIndiansDiabetes2, family=binomial)
summary(log.model)
##
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = PimaIndiansDiabetes2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7823 -0.6603 -0.3642 0.6409 2.5612
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.004e+01 1.218e+00 -8.246 < 2e-16 ***
## pregnant 8.216e-02 5.543e-02 1.482 0.13825
## glucose 3.827e-02 5.768e-03 6.635 3.24e-11 ***
## pressure -1.420e-03 1.183e-02 -0.120 0.90446
## triceps 1.122e-02 1.708e-02 0.657 0.51128
## insulin -8.253e-04 1.306e-03 -0.632 0.52757
## mass 7.054e-02 2.734e-02 2.580 0.00989 **
## pedigree 1.141e+00 4.274e-01 2.669 0.00760 **
## age 3.395e-02 1.838e-02 1.847 0.06474 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.10 on 391 degrees of freedom
## Residual deviance: 344.02 on 383 degrees of freedom
## (376 observations deleted due to missingness)
## AIC: 362.02
##
## Number of Fisher Scoring iterations: 5
predictions <- predict(log.model , type="response")
head(predictions)
## 4 5 7 9 14 15
## 0.02711452 0.89756363 0.03763559 0.85210016 0.79074609 0.71308818
fact.predictions <- rep("neg", length(diabetes))
fact.predictions[predictions > 0.5] <- "pos"
table(fact.predictions , diabetes)
## diabetes
## fact.predictions neg pos
## neg 378 189
## pos 122 79
mean(fact.predictions == diabetes, na.rm=TRUE)
## [1] 0.5950521
fact.predictions <- rep("neg", length(diabetes))
fact.predictions[predictions > 0.6]="pos"
table(fact.predictions, diabetes)
## diabetes
## fact.predictions neg pos
## neg 400 198
## pos 100 70
mean(fact.predictions == diabetes, na.rm=TRUE)
## [1] 0.6119792
log.model <- glm(diabetes ~ glucose, data=PimaIndiansDiabetes2, family=binomial)
new.data <- data.frame(glucose=138, mass=35.5, pedigree=0.325)
predict(log.model, newdata=new.data, type="response")
## 1
## 0.473114
##
## Attaching package: 'data.table'
## The following object is masked from 'package:DescTools':
##
## %like%
summary(flights)
## icao24 callsign country longitude
## Length:3822909 Length:3822909 Length:3822909 Min. :-163.42
## Class :character Class :character Class :character 1st Qu.: -89.98
## Mode :character Mode :character Mode :character Median : -12.82
## Mean : -22.12
## 3rd Qu.: 24.10
## Max. : 179.07
## latitude altitude.baro velocity track
## Min. :-57.13 Min. : -960.1 Min. : 0.0 Min. :-172.63
## 1st Qu.: 31.73 1st Qu.: 3794.8 1st Qu.: 151.2 1st Qu.: 92.07
## Median : 38.40 Median : 9448.8 Median : 206.9 Median : 191.10
## Mean : 33.98 Mean : 7668.4 Mean : 189.8 Mean : 183.47
## 3rd Qu.: 44.94 3rd Qu.:10972.8 3rd Qu.: 236.3 3rd Qu.: 270.83
## Max. : 88.50 Max. :38648.6 Max. :2384.5 Max. : 359.90
## vertical.rate time
## Min. :-1541.140 Min. : 0
## 1st Qu.: -0.650 1st Qu.:203400
## Median : 0.000 Median :395100
## Mean : -0.193 Mean :391961
## 3rd Qu.: 0.330 3rd Qu.:585890
## Max. : 7764.780 Max. :777600
library(corrplot)
corrplot(cor(flights[, -c(1, 2, 3)], use="pairwise.complete.obs"))
library(vcd)
assocstats(table(flights$callsign, flights$country))
## X^2 df P(> X^2)
## Likelihood Ratio 20977515 21686635 1
## Pearson 532547665 21686635 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.996
## Cramer's V : 0.98
model.velocity <- lm(velocity ~ altitude.baro, data=flights)
summary(model.velocity)
##
## Call:
## lm(formula = velocity ~ altitude.baro, data = flights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -565.89 -23.36 -0.06 24.00 2150.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.749e+01 3.967e-02 2205 <2e-16 ***
## altitude.baro 1.334e-02 4.564e-06 2922 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.53 on 3822907 degrees of freedom
## Multiple R-squared: 0.6908, Adjusted R-squared: 0.6908
## F-statistic: 8.54e+06 on 1 and 3822907 DF, p-value: < 2.2e-16
model.velocity <- lm(velocity ~ longitude + latitude + altitude.baro, data=flights)
summary(model.velocity)
##
## Call:
## lm(formula = velocity ~ longitude + latitude + altitude.baro,
## data = flights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -567.15 -23.60 -0.88 23.77 2146.62
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.695e+01 4.805e-02 1809.75 <2e-16 ***
## longitude 5.542e-02 2.456e-04 225.60 <2e-16 ***
## latitude 6.625e-02 9.723e-04 68.14 <2e-16 ***
## altitude.baro 1.327e-02 4.573e-06 2902.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.29 on 3822905 degrees of freedom
## Multiple R-squared: 0.6948, Adjusted R-squared: 0.6948
## F-statistic: 2.901e+06 on 3 and 3822905 DF, p-value: < 2.2e-16
model.velocity <- lm(velocity ~ altitude.baro + track, data=flights)
summary(model.velocity)
##
## Call:
## lm(formula = velocity ~ altitude.baro + track, data = flights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -584.87 -20.64 0.89 22.42 2146.45
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.082e+02 4.977e-02 2173.2 <2e-16 ***
## altitude.baro 1.329e-02 4.340e-06 3063.4 <2e-16 ***
## track -1.110e-01 1.743e-04 -636.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.73 on 3822906 degrees of freedom
## Multiple R-squared: 0.7204, Adjusted R-squared: 0.7204
## F-statistic: 4.926e+06 on 2 and 3822906 DF, p-value: < 2.2e-16
model.vertical <- lm(vertical.rate ~ longitude + latitude + altitude.baro + velocity, data=flights)
summary(model.vertical)
##
## Call:
## lm(formula = vertical.rate ~ longitude + latitude + altitude.baro +
## velocity, data = flights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1541.0 -0.6 0.0 0.4 7764.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.134e+00 2.005e-02 -56.544 < 2e-16 ***
## longitude -3.130e-04 7.573e-05 -4.133 3.59e-05 ***
## latitude -1.871e-03 2.980e-04 -6.280 3.40e-10 ***
## altitude.baro -2.266e-06 2.507e-06 -0.904 0.366
## velocity 5.347e-03 1.566e-04 34.134 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.11 on 3822904 degrees of freedom
## Multiple R-squared: 0.0009354, Adjusted R-squared: 0.0009343
## F-statistic: 894.8 on 4 and 3822904 DF, p-value: < 2.2e-16